This tutorial describe classical analysis of single-cell RNA seq data. We will use the Rstudio server of IFB: https://rstudio.cluster.france-bioinformatique.fr We will work with public data of HSPCs generated with DropSeq protocol (Rodriguez-Fraticelli et al. 2018). First we will follow the very popular Seurat workflow (Stuart et al. 2019) available here: https://satijalab.org/seurat/v3.1/pbmc3k_tutorial.html
load.fun <- function(x) {
x <- as.character(substitute(x))
if(isTRUE(x %in% .packages(all.available=TRUE))) {
eval(parse(text=paste("require(", x, ")", sep="")))
} else {
eval(parse(text=paste("install.packages('", x, "', repos = 'http://cran.us.r-project.org')", sep="")))
eval(parse(text=paste("require(", x, ")", sep="")))
}
}
load.bioc <- function(x) {
x <- as.character(substitute(x))
if(isTRUE(x %in% .packages(all.available=TRUE))) {
eval(parse(text=paste("require(", x, ")", sep="")))
} else {
eval(parse(text="source('http://bioconductor.org/biocLite.R')"))
eval(parse(text=paste("biocLite('", x, "')", sep="")))
}
}
suppressMessages(load.fun("Seurat"))
suppressMessages(load.fun("gProfileR"))
suppressMessages(load.bioc("scran"))
## Warning: replacing previous import 'BiocGenerics::dims' by 'Biobase::dims'
## when loading 'SummarizedExperiment'
## Warning: replacing previous import 'Biobase::dims' by 'DelayedArray::dims'
## when loading 'SummarizedExperiment'
# Installation of monocle version 2 Be careful a version 3 is available now
suppressMessages(load.fun("VGAM"))
suppressMessages(load.fun("DDRTree"))
suppressMessages(load.bioc("HSMMSingleCell"))
suppressMessages(load.fun("combinat"))
suppressMessages(load.fun("fastICA"))
suppressMessages(load.fun("densityClust"))
suppressMessages(load.fun("qlcMatrix"))
suppressMessages(load.fun("proxy"))
suppressMessages(load.fun("slam"))
suppressMessages(load.bioc("biocViews"))
path <- "https://bioconductor.org/packages/release/bioc/src/contrib/monocle_2.14.0.tar.gz"
suppressMessages(install.packages(path, repos=NULL, type="source"))
library(Seurat)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks Matrix::expand(), S4Vectors::expand()
## ✖ tidyr::fill() masks VGAM::fill()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ purrr::simplify() masks DelayedArray::simplify()
## ✖ dplyr::slice() masks IRanges::slice()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(gProfileR)
library(scran)
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
library(RColorBrewer)
LTHSC <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2411nnn/GSM2411664/suppl/GSM2411664_LTHSC.raw_umifm_counts.csv.gz"
STHSC <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2411nnn/GSM2411668/suppl/GSM2411668_STHSC.raw_umifm_counts.csv.gz"
MPP2 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2411nnn/GSM2411665/suppl/GSM2411665_MPP2.raw_umifm_counts.csv.gz"
MPP3 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2411nnn/GSM2411666/suppl/GSM2411666_MPP3.raw_umifm_counts.csv.gz"
MPP4 <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2411nnn/GSM2411667/suppl/GSM2411667_MPP4.raw_umifm_counts.csv.gz"
rodriguezDataUrl <- list(LTHSC,STHSC,MPP2,MPP3)
downloadRodriguez <- function(url) {
if(!file.exists(paste0("../RodriguezData/",basename(url)))) {
dir.create("../RodriguezData/",showWarnings = F)
download.file(url, destfile = paste0("../RodriguezData/",basename(url)), method = "wget")
}
}
lapply(rodriguezDataUrl, downloadRodriguez)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
files <- list.files("../RodriguezData/",full.names = T)
datasets <- lapply(files, read_csv)
## Parsed with column specification:
## cols(
## .default = col_double(),
## cell_id = col_character(),
## barcode = col_character(),
## library_id = col_character(),
## seq_run_id = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## cell_id = col_character(),
## barcode = col_character(),
## library_id = col_character(),
## seq_run_id = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## cell_id = col_character(),
## barcode = col_character(),
## library_id = col_character(),
## seq_run_id = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_double(),
## cell_id = col_character(),
## barcode = col_character(),
## library_id = col_character(),
## seq_run_id = col_character()
## )
## See spec(...) for full column specifications.
rodriguezData <- rbind(datasets[[1]],datasets[[2]],datasets[[3]],datasets[[4]])
rodriguezData[c(1:10),c(1:10)]
## # A tibble: 10 x 10
## cell_id barcode library_id seq_run_id pass_filter `0610007P14Rik`
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 LTHSC1 TGACTT… LTHSC seq_run1 0 3
## 2 LTHSC2 GGATAA… LTHSC seq_run1 0 4
## 3 LTHSC3 AAAAGT… LTHSC seq_run1 0 1
## 4 LTHSC4 TGATGT… LTHSC seq_run1 0 4
## 5 LTHSC5 TAGCCT… LTHSC seq_run1 0 0
## 6 LTHSC6 GAAGGA… LTHSC seq_run1 0 0
## 7 LTHSC7 GAATCA… LTHSC seq_run1 0 0
## 8 LTHSC8 TGATTC… LTHSC seq_run1 0 1
## 9 LTHSC9 AAGCTA… LTHSC seq_run1 0 0
## 10 LTHSC10 ATGTGT… LTHSC seq_run1 0 1
## # … with 4 more variables: `0610009B22Rik` <dbl>, `0610009L18Rik` <dbl>,
## # `0610009O20Rik` <dbl>, `0610010F05Rik` <dbl>
Now we can create the seurat object using the CreateSeuratObject function.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning in if (class(x = meta.data) == "data.frame") {: the condition has
## length > 1 and only the first element will be used
## An object of class Seurat
## 28205 features across 3925 samples within 1 assay
## Active assay: RNA (28205 features)
Phenotypic data of the cells are stored in the meta.data slot. We can access row count using the GetAssayData function, the number of genes and cells with the nrow and ncol functions respectively.
head(seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA cell_id barcode
## LTHSC1 SeuratProject 17065 4621 LTHSC1 TGACTTCTGGA-ATACCCAG
## LTHSC2 SeuratProject 13445 4486 LTHSC2 GGATAAAG-GTCCGTAC
## LTHSC3 SeuratProject 9514 3651 LTHSC3 AAAAGTCGG-GTTAACCA
## LTHSC4 SeuratProject 13428 4060 LTHSC4 TGATGTCTTTC-AAACTCGA
## LTHSC5 SeuratProject 10544 3681 LTHSC5 TAGCCTCG-TCCAGGGA
## LTHSC6 SeuratProject 9443 3406 LTHSC6 GAAGGAAGAC-GACAAAGG
## library_id seq_run_id pass_filter
## LTHSC1 LTHSC seq_run1 0
## LTHSC2 LTHSC seq_run1 0
## LTHSC3 LTHSC seq_run1 0
## LTHSC4 LTHSC seq_run1 0
## LTHSC5 LTHSC seq_run1 0
## LTHSC6 LTHSC seq_run1 0
GetAssayData(seurat)[c(1:5),c(1:5)]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## LTHSC1 LTHSC2 LTHSC3 LTHSC4 LTHSC5
## 0610007P14Rik 3 4 1 4 .
## 0610009B22Rik . . . . .
## 0610009L18Rik . 1 . . .
## 0610009O20Rik . . . 1 .
## 0610010F05Rik . . . . .
nrow(seurat)
## [1] 28205
ncol(seurat)
## [1] 3925
seurat@meta.data$library_id <- factor(seurat@meta.data$library_id, levels = c("LTHSC","STHSC","MPP2","MPP3"))
colorCellType <- c(brewer.pal(4,"Set2"))[c(1,3,4,2)]
Before the analysis of the dataset, we need to filter out poor quality cells such as doublet, lysed and dying/stresed cells. Dying and highly stressed cells can be identified by their high expression of mitochondrial genes. We calculate mitochondrial QC metrics with the PercentageFeatureSet function, which calculates the percentage of counts originating from a set of features. We use the set of all genes starting with MT- as a set of mitochondrial genes.
seurat$percent.mt <- PercentageFeatureSet(seurat, pattern = "^mt-")
Number of genes expressed by each cell as well as number of total UMI counted in each cell were calculated by seurat during the creation of the seurat object and are stored in the meta.data slot. We can now vizualize this different qc metrics and try to detect outliers that are poor quality cells with the VlnPLot function.
# Visualize QC metrics as a violin plot
VlnPlot(seurat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),pt.size = 0.1, ncol = 3)
We can also use the FeatureScatter function to observe correlation between the qc metrics.
plot1 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2),ncol = 1)
In their study (Rodriguez-Fraticelli et al. 2018) Rodriguez et al filter out cells with few mRNA counts (< 1000 UMIs) and stressed cells (mitochondrial gene-set Z-score > 1). They stored the results of the filtering in the pass_filter column of their files that we stored in the meta.data of our object.
plot3 <- FeatureScatter(seurat, feature2 = "nCount_RNA", feature1 = "pass_filter")
plot4 <- FeatureScatter(seurat, feature2 = "percent.mt", feature1 = "pass_filter")
plot5 <- FeatureScatter(seurat, feature2 = "nFeature_RNA", feature1 = "pass_filter")
CombinePlots(plots = list(plot3,plot4, plot5))
We use their filtering to filter out the poor quality cells with the subset function.
seurat <- subset(x = seurat, subset = pass_filter > 0.1)
We can now check the results of the filtering like previously with the FeatureScatter function.
plot1 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2),ncol = 1)
A lot of genes are lowly expressed by the cells and detected in just few cells in the dataset. This can lead to unwanted noise in the analysis and longer time for the computing steps. To avoid it we keep only genes expressed in at least 0.01% percent of the remaining cells.
# First we create a data frame for genes information fdata
fdata <- data.frame(gene_short_name = rownames(seurat),num_cells_expressed = NA)
rownames(fdata) <- fdata$gene_short_name
# We compute the for each gene the number of cells that expressed it with the rowSums function
fdata$num_cells_expressed <- rowSums(as.matrix(GetAssayData(seurat)))
# we then compute the desired cut off
cutOffExpressingCells <- 0.001*ncol(seurat)
#We mark genes that are above the cut off
fdata$kept_genes <- fdata$num_cells_expressed > cutOffExpressingCells
# We finally update our seurat objetc removing the non expressed genes using the CreateSeuratObject and GetAssayData functions
seurat <- CreateSeuratObject(counts = GetAssayData(seurat)[which(fdata$kept_genes),],meta.data = seurat@meta.data)
Once unwanted cells are removed we normalized the data. We use the standard Seurat normalisation. It employs a global-scaling normalization method ???LogNormalize??? that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. Normalized values are stored in seurat[[“RNA”]](???).
seurat <- NormalizeData(seurat, normalization.method = "LogNormalize", scale.factor = 10000)
The seurat workflow starts with the selection of Highly variable genes that will be used for dimensionnal reduction and for the clustering. By taking only the top 2000 highly variable genes we considerabely decrease the time of the anlysis.
seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(seurat), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(seurat)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
## When using repel, set xnudge and ynudge to 0 for optimal results
CombinePlots(plots = list(plot1, plot2),ncol = 1)
seurat <- ScaleData(seurat)
## Centering and scaling data matrix
PCA main ideas https://www.youtube.com/watch?v=HMOI_lkzW08 PCA step by steps https://www.youtube.com/watch?v=FgakZw6K1QQ&t=8s We use the RunPCA function to do the PCA on the highly variable genes detected before.
seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat))
## PC_ 1
## Positive: Mzb1, Jchain, Tff1, Pou2af1, Ly6d, Lrrc4, Gm4857, Xirp1, Kcna4, Derl3
## Chst1, Ccr10, Prg2, Olfr1443, Moap1, Tbc1d8b, Rnf170, 4930402H24Rik, Cd79a, Gm15153
## Oosp1, Pitx1, Gm17305, Rfx3, E430018J23Rik, Gm5776, Fcrla, Flnc, Slc7a14, Dnajc11
## Negative: Ptma, Stmn1, Ppia, Car2, H2afy, Tuba1b, Gm13092, Tubb5, Hdgf, Hmgb2
## Ybx3, Mcm7, Dut, H2afz, Hlf, Myc, Lgals9, Nucks1, Eif5a, Top2a
## H2afx, Cdca7, Pbx1, Marcksl1, Ifitm1, Gata2, Tsc22d1, Banf1, Mpl, Pfn1
## PC_ 2
## Positive: Hmgb2, Elane, Ctsg, Mpo, Clec12a, Hp, Ms4a3, 2810417H13Rik, Tyrobp, Top2a
## Hist1h2ap, C3, Plac8, Cks2, Ccna2, Chil3, Gsr, H2afz, H2afx, Ly6c2
## Ap3s1, Ube2c, F13a1, Cybb, Tmsb4x, Cdca8, Cks1b, Birc5, Fcgr3, Mki67
## Negative: Hlf, Ifitm1, Mpl, Tsc22d1, Car2, Tmem176b, Serpina3g, Mecom, Mycn, Jund
## Nkx2-3, B2m, Nr4a1, Zbtb20, Ppp1r15a, Rgs1, Cbx6, Mllt3, Nr4a2, Smarca2
## Sult1a1, Epb4.1l4b, Hes1, Hoxa5, Iigp1, Art4, Fos, Def8, Dusp2, Pbx1
## PC_ 3
## Positive: Tyrobp, Lyz2, Lgals3, Fcer1g, Mpeg1, Cybb, Ccl6, S100a4, Pld4, Plbd1
## Clec4a1, Cebpb, Fn1, Csf1r, Fgr, Chil3, Ccr2, C3, Fcgr4, Ctss
## Ifitm6, Ear2, Itgb2, S100a6, Psap, AB124611, Cyp4f18, Clec4a3, Lsp1, Ahnak
## Negative: Tubb5, Tuba1b, Stmn1, Top2a, Hdgf, H2afx, Ptma, Cks1b, Dut, Mki67
## Hist1h2ap, Ccna2, Hmgb2, H2afz, Cdca8, Ppia, Rrm1, Nucks1, Tk1, Cdk1
## Ncapd2, Gm13092, 2810417H13Rik, Ybx3, Pbk, Birc5, Mcm7, Cdca3, Ube2c, Cks2
## PC_ 4
## Positive: Ms4a2, Fcer1a, Prss34, Cpa3, Mcpt8, Hdc, Cd200r3, Ccl3, Cdh1, Alox5
## Cyp11a1, Rnase12, Gp49a, Osm, Csrp3, Ccl4, Tnfaip3, Egr1, Alox15, Cebpa
## Ier3, Cd69, Cyp4f18, Ets1, Junb, Slc7a8, Nfkbia, Jun, Plac8, Csf2rb
## Negative: Lyz2, Ifitm3, Fn1, S100a6, Mpeg1, Cybb, S100a4, Psap, Chil3, Tgfbi
## C3, Pld4, F13a1, Ifi30, I830127L07Rik, Car2, Gda, Ly6g, Ctss, Plbd1
## Mpl, Ifitm6, Ly6c2, Ly86, B2m, Gpr141, Csf1r, Unc93b1, Clec4a1, Lsp1
## PC_ 5
## Positive: Apoe, Csf1r, Car1, Clec4a1, Aqp1, Fcgr4, Ace, Pld4, Ccl6, Ear2
## Klf1, Plbd1, Ctss, Fabp4, Ifi30, Gata1, Clec4a3, Cd36, Lgals3, Mpeg1
## Ces2g, Crip1, Rhd, Fcer1g, S100a4, Blvrb, Cyp4f18, Apoc2, Ube2c, Ccna2
## Negative: Wfdc21, S100a8, Elane, Fcnb, S100a9, Ms4a3, Prtn3, Trem3, Mpo, Cldn15
## Ctsg, Camp, Cebpe, Prss57, Mgst2, Fam101b, Clec12a, Hp, Ngp, Gfi1
## Clec4a2, Lrg1, Gstm1, B4galt6, Anxa3, Sept5, Lta4h, Chst13, Rgcc, Lcn2
First we can compute the the proportion of variance explained by each principal componen of the PCA.
pca_var <- apply(seurat@reductions$pca@cell.embeddings,2,var)
prop_pca_var <- pca_var/sum(pca_var)
barplot(prop_pca_var[c(1:20)], ylab = "proportion variance explained",las = 2)
We can also use the Elbow plot function of Seurat
ElbowPlot(seurat)
We can the top genes for the first two principal components with the VizDimLoadings function.
VizDimLoadings(seurat, dims = 1:2, reduction = "pca")
We can plot the cells within the two first component PCA space.
DimPlot(seurat, reduction = "pca")
We can choose a meta.data column to color the cells using the group.by argument of the DimPlot function.
DimPlot(seurat, reduction = "pca",group.by = "library_id",cols = colorCellType)
We we will use in the significant PCs in the next step of the analysis. But before that we need to inspect these PCs to see if we have some cofounding factors (cellular stress, cell cylce, library depth). To this aim we cab do enrichment analysis for the top genes of these PCs using the gProfileR packages.
firstPC_genes <- seurat@reductions$pca@feature.loadings[,c(1:5)]
getTopGenes <- function(seurat, pc,ntop = 30) {
pc <- paste0("PC_",pc)
firstPC_genes <- seurat@reductions$pca@feature.loadings[,c(1:5)]
topFeatures <- firstPC_genes[order(abs(firstPC_genes[,pc]),decreasing = T)[c(1:ntop)],pc]
return(names(topFeatures))
}
topPC_features <- lapply(c(1:5), getTopGenes,seurat = seurat)
Enrichment Analysis: https://www.youtube.com/watch?v=udyAvvaMjfM
results <- gprofiler(topPC_features,organism = "mmusculus",custom_bg = rownames(seurat))
## Warning: Please consider using the new package "gprofiler2". At the moment you are using a deprecated package relying on outdated data.
## More information at https://biit.cs.ut.ee/gprofiler/page/r-new. Feel free to contact us at biit.support@ut.ee for further help.
head(results,n=20)
## query.number significant p.value term.size query.size overlap.size
## 1 2 TRUE 1.93e-02 25 29 3
## 2 2 TRUE 2.64e-02 498 29 7
## 3 2 TRUE 3.76e-02 346 29 6
## 4 2 TRUE 1.29e-02 22 29 3
## 5 2 TRUE 1.29e-02 22 29 3
## 6 2 TRUE 8.17e-03 19 29 3
## 7 2 TRUE 9.61e-03 20 29 3
## 8 2 TRUE 5.75e-03 17 29 3
## 9 2 TRUE 6.89e-03 18 29 3
## 10 2 TRUE 4.74e-03 16 29 3
## 11 2 TRUE 3.65e-02 977 29 9
## 12 2 TRUE 3.04e-02 717 29 8
## 13 2 TRUE 4.68e-03 556 29 8
## 14 2 TRUE 4.57e-02 542 29 7
## 15 2 TRUE 1.26e-03 192 29 6
## 16 2 TRUE 3.12e-02 194 29 5
## 17 2 TRUE 3.37e-02 4 29 2
## 18 2 TRUE 4.57e-02 64 20 3
## 19 2 TRUE 3.44e-02 138 20 4
## 20 2 TRUE 2.25e-05 106 20 6
## precision recall term.id domain subgraph.number
## 1 0.103 0.120 GO:0050832 BP 8
## 2 0.241 0.014 GO:0051301 BP 16
## 3 0.207 0.017 GO:0000280 BP 12
## 4 0.103 0.136 GO:0044110 BP 3
## 5 0.103 0.136 GO:0044116 BP 3
## 6 0.103 0.158 GO:0044144 BP 3
## 7 0.103 0.150 GO:0044117 BP 3
## 8 0.103 0.176 GO:0044126 BP 3
## 9 0.103 0.167 GO:0044146 BP 3
## 10 0.103 0.188 GO:0044130 BP 3
## 11 0.310 0.009 GO:0051276 BP 15
## 12 0.276 0.011 GO:0000278 BP 9
## 13 0.276 0.014 GO:1903047 BP 9
## 14 0.241 0.013 GO:0009617 BP 2
## 15 0.207 0.031 GO:0042742 BP 2
## 16 0.172 0.026 GO:0000793 CC 5
## 17 0.069 0.500 GO:0032133 CC 6
## 18 0.150 0.047 KEGG:05140 keg 10
## 19 0.200 0.029 KEGG:04217 keg 14
## 20 0.300 0.057 KEGG:05322 keg 7
## term.name
## 1 defense response to fungus
## 2 cell division
## 3 nuclear division
## 4 growth involved in symbiotic interaction
## 5 growth of symbiont involved in interaction with host
## 6 modulation of growth of symbiont involved in interaction with host
## 7 growth of symbiont in host
## 8 regulation of growth of symbiont in host
## 9 negative regulation of growth of symbiont involved in interaction with host
## 10 negative regulation of growth of symbiont in host
## 11 chromosome organization
## 12 mitotic cell cycle
## 13 mitotic cell cycle process
## 14 response to bacterium
## 15 defense response to bacterium
## 16 condensed chromosome
## 17 chromosome passenger complex
## 18 Leishmaniasis
## 19 Necroptosis
## 20 Systemic lupus erythematosus
## relative.depth intersection
## 1 1 MPO,ELANE,CTSG
## 2 1 UBE2C,BIRC5,TOP2A,CCNA2,CKS1B,CDCA8,CKS2
## 3 1 UBE2C,BIRC5,TOP2A,CDCA8,MKI67,CKS2
## 4 1 MPO,ELANE,CTSG
## 5 2 MPO,ELANE,CTSG
## 6 3 MPO,ELANE,CTSG
## 7 3 MPO,ELANE,CTSG
## 8 4 MPO,ELANE,CTSG
## 9 1 MPO,ELANE,CTSG
## 10 2 MPO,ELANE,CTSG
## 11 1 UBE2C,BIRC5,TOP2A,CCNA2,CDCA8,MKI67,H2AFZ,H2AFX,HMGB2
## 12 1 UBE2C,BIRC5,TOP2A,CCNA2,CKS1B,CDCA8,MKI67,CKS2
## 13 2 UBE2C,BIRC5,TOP2A,CCNA2,CKS1B,CDCA8,MKI67,CKS2
## 14 1 MPO,ELANE,C3,PLAC8,HP,CTSG,HMGB2
## 15 2 MPO,ELANE,PLAC8,HP,CTSG,HMGB2
## 16 1 BIRC5,TOP2A,MKI67,H2AFX,HMGB2
## 17 1 BIRC5,CDCA8
## 18 1 CYBB,C3,FCGR3
## 19 1 CYBB,H2AFZ,H2AFX,HIST1H2AP
## 20 1 ELANE,C3,H2AFZ,CTSG,H2AFX,HIST1H2AP
The PC 2 clearly reflect the cell cycle. We will analyse cell cylce in more detail with the cyclone function from the scran package. this function permits us to assign a cell cycle phase to each cell.
if(!file.exists("../cyclone/assignments.rds")) {
dir.create(path = "../cyclone",showWarnings = F)
gene_count_matrix <- as.matrix(GetAssayData(seurat))
set.seed(100)
mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
#in order to use the genes pairs provided we need to convert them from ensembl_id to gene_short_name
conversionTable <- gconvert(rownames(gene_count_matrix),organism = "mmusculus",target = "ENSG")
gene_count_matrix <- gene_count_matrix[conversionTable$alias.number,]
rownames(gene_count_matrix) <- conversionTable$target
#ensembl <- mapIds(org.Mm.eg.db, keys=rownames(gene_count_matrix), keytype="SYMBOL", column="ENSEMBL")
assignments <- cyclone(gene_count_matrix, mm.pairs, gene.names=rownames(gene_count_matrix))
saveRDS(assignments,"../cyclone/assignments.rds")
} else {
assignments <- readRDS("../cyclone/assignments.rds")
}
## add cell cycle phase to pData
#print(head(assignments))
seurat$phases <- assignments$phases
seurat$G1_score <- assignments$scores$G1
seurat$G2M_score <- assignments$scores$G2M
seurat$S_score <- assignments$scores$S
seurat$phases[which(seurat$phases=="G1")] <- "G1_G0"
seurat$phases[which(seurat$phases=="G2M")] <- "G2_M"
plot(assignments$score$G1, assignments$score$G2M,
xlab="G1 score", ylab="G2/M score", pch=16)
saveRDS(assignments,"cyclone/assignments.rds")
The cyclone analysis confirm that PC_2 reflect the cell cycle effect.
DimPlot(seurat,group.by = "phases")
plot(-seurat@reductions$pca@cell.embeddings[,"PC_2"],seurat$G2M_score)
We can correct for cell cycle effect biais in the scaling step and rerun the pca on the scale and now corrected data.
seurat <- ScaleData(seurat,vars.to.regress = c("G2M_score","S_score","G1_score"))
## Regressing out G2M_score, S_score, G1_score
## Centering and scaling data matrix
seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat))
## PC_ 1
## Positive: Mzb1, Jchain, Tff1, Pou2af1, Ly6d, Lrrc4, Gm4857, Xirp1, Kcna4, Derl3
## Chst1, Ccr10, Prg2, Moap1, Olfr1443, Tbc1d8b, Rnf170, 4930402H24Rik, Cd79a, Gm15153
## Oosp1, Pitx1, Gm17305, Rfx3, E430018J23Rik, Gm5776, Fcrla, Flnc, Slc7a14, Dnajc11
## Negative: Ptma, Ppia, Stmn1, Car2, H2afy, Tuba1b, Gm13092, Hlf, Ifitm1, Tubb5
## Hdgf, Tsc22d1, Rgs1, Marcksl1, Myc, Ybx3, Mpl, Nr4a1, Gata2, Mcm7
## Pbx1, Lgals9, Hmgb2, Dut, Cdca7, Ifitm3, Eif5a, Srgn, Nr4a2, H2afz
## PC_ 2
## Positive: Tyrobp, Cybb, Lgals3, Mpeg1, Fcer1g, C3, Chil3, Hp, Lyz2, Fn1
## F13a1, Pld4, Clec12a, Tmsb4x, Fcgr3, Ccl6, Ccr2, S100a4, Msrb1, Plbd1
## Csf1r, Clec4a1, Pglyrp1, Cebpb, Ncf1, AB124611, Itgb2, Ifi30, Ly86, Fgr
## Negative: Car2, Tmem176b, Hlf, Tsc22d1, Mpl, Pbx1, Serpina3g, Cbx6, Krit1, Stmn1
## Tff1, Ifitm1, Gm4857, Chst1, Ppp1r15a, Mzb1, Pou2af1, Rfx3, Lrrc4, Xirp1
## Ly6d, Rnf170, Jchain, Tbc1d8b, E430018J23Rik, Mycn, Derl3, Egr1, Dnajc11, Gata2
## PC_ 3
## Positive: Klf2, Ifitm1, Hlf, Ifitm3, B2m, Nr4a1, Fcgr4, Psap, Lyz2, Zfp36
## S100a4, Clec4a1, Ctss, Lsp1, Plbd1, Ace, Ear2, Ccl6, Ypel3, Fos
## Csf1r, Fabp4, Junb, Clec4a3, Pld4, Rras, Fgr, Lgals3, Mpl, Ahnak
## Negative: Dut, Mpo, Tuba1b, Ptma, Ctsg, 2810417H13Rik, H2afz, Hmgb2, Elane, Ms4a3
## Mcm7, Tubb5, Top2a, Stmn1, Ybx3, Cks1b, Gm13092, Ppia, Rrm1, Hdgf
## Hist1h2ap, Tk1, Rrm2, Calr, H2afx, Ap3s1, Sdf2l1, Gatm, Banf1, Eif5a
## PC_ 4
## Positive: Ms4a2, Fcer1a, Prss34, Mcpt8, Cpa3, Cd200r3, Hdc, Ccl3, Cdh1, Alox5
## Cyp11a1, Gp49a, Rnase12, Csrp3, Osm, Tnfaip3, Ccl4, Alox15, Cyp4f18, Cebpa
## Ets1, Cd69, Nfkbia, Egr1, Ier3, Slc7a8, Cd7, Junb, Csf2rb, Emilin2
## Negative: Lyz2, Ifitm3, Fn1, S100a6, Chil3, Mpeg1, Psap, S100a4, C3, Mpl
## Cybb, F13a1, Tgfbi, I830127L07Rik, Gda, Ly6g, Ly6c2, B2m, Car2, Ifi30
## Hlf, Unc93b1, Pld4, Anxa5, Hp, Fkbp1a, Gpr141, Ctss, Cd177, Plcg2
## PC_ 5
## Positive: Car1, Aqp1, Klf1, Rhd, Ces2g, Mt2, Atp1b2, Gata1, Apoe, Blvrb
## Csf1r, Clec4a1, Pld4, Ifi30, Fcgr4, Ace, Ermap, Ear2, Plbd1, Fabp4
## Ctss, Sept8, Mpeg1, Crip1, Cd36, Clec4a3, Slc25a21, Hspa1b, Ybx3, Apoc2
## Negative: Wfdc21, S100a8, Fcnb, S100a9, Elane, Prtn3, Ms4a3, Cldn15, Trem3, Camp
## Cebpe, Mpo, Mgst2, Ifitm1, Gstm1, Ctsg, Hlf, Fam101b, Ngp, Gfi1
## Prss57, B4galt6, Lrg1, Clec12a, Lcn2, Sept5, Clec4a2, Hp, Srgn, Anxa1
VizDimLoadings(seurat, dims = 1:2, reduction = "pca")
DimPlot(seurat,group.by = "phases")
DimPlot(seurat,group.by = "library_id")
DimPlot(seurat,group.by = "library_id",dims = c(2,3),cols = colorCellType)
DimHeatmap allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and genes are ordered according to their PCA scores.
DimHeatmap(seurat, dims = 1:15, cells = 2000, balanced = TRUE)
We choose the PCs to use according to the elbow plot.
ElbowPlot(seurat)
From Seurat tutorial: Seurat v3 applies a graph-based clustering approach,Seura first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the FindNeighbors function, and takes as input the previously defined dimensionality of the dataset (first 13 PCs).
To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default), to iteratively group cells together, with the goal of optimizing the standard modularity function (fraction of the edges that fall within the given groups minus the expected fraction if edges were distributed at random). The FindClusters function implements this procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters can be found using the Idents function.
seurat <- FindNeighbors(seurat, dims = 1:13)
## Computing nearest neighbor graph
## Computing SNN
seurat <- FindClusters(seurat, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3283
## Number of edges: 108509
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8171
## Number of communities: 12
## Elapsed time: 0 seconds
From Seurat tutorial: Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP, to visualize and explore these datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. As input to the UMAP and tSNE, we suggest using the same PCs as input to the clustering analysis.
A video about t-SNE: https://www.youtube.com/watch?v=NEaUSP4YerM
seurat <- RunTSNE(seurat, dims = 1:13)
DimPlot(seurat, reduction = "tsne")
colorClusters <- c("#999999","#004949","#009292","#ff6db6","#ffb6db",
"#490092","#006ddb","#b66dff","#6db6ff","#b6dbff",
"#920000","#924900")
DimPlot(seurat, reduction = "tsne",cols = colorClusters)
p1 <- DimPlot(seurat, reduction = "tsne",cols = colorClusters)
p2 <- DimPlot(seurat, reduction = "tsne",group.by = "library_id",cols = colorCellType)
p3 <- DimPlot(seurat, reduction = "tsne",group.by = "phases")
p4 <- FeaturePlot(seurat,reduction = "tsne",features = "nCount_RNA")
CombinePlots(list(p1,p2,p3,p4))
seurat <- RunUMAP(seurat,dims = c(1:13))
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 20:26:19 UMAP embedding parameters a = 0.9922 b = 1.112
## 20:26:19 Read 3283 rows and found 13 numeric columns
## 20:26:19 Using Annoy for neighbor search, n_neighbors = 30
## 20:26:19 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:26:20 Writing NN index file to temp file /tmp/RtmpqflhnI/file2559c47d61965
## 20:26:20 Searching Annoy index using 1 thread, search_k = 3000
## 20:26:21 Annoy recall = 100%
## 20:26:22 Commencing smooth kNN distance calibration using 1 thread
## 20:26:24 Initializing from normalized Laplacian + noise
## 20:26:24 Commencing optimization for 500 epochs, with 129166 positive edges
## 20:26:33 Optimization finished
DimPlot(seurat)
p1 <- DimPlot(seurat,cols = colorClusters)
p2 <- DimPlot(seurat,group.by = "library_id", cols = colorCellType)
p3 <- DimPlot(seurat,group.by = "phases")
p4 <- FeaturePlot(seurat,,features = "nCount_RNA")
CombinePlots(list(p1,p2,p3,p4))
From Seurat tutorial: Seurat can help you find markers that define clusters via differential expression. By default, it identifes positive and negative markers of a single cluster (specified in ident.1), compared to all other cells. FindAllMarkers automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells.
The min.pct argument requires a feature to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a feature to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of features that are unlikely to be highly discriminatory. As another option to speed up these computations, max.cells.per.ident can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significiant and the most highly differentially expressed features will likely still rise to the top.
markers <- FindAllMarkers(seurat, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
markers <- markers[which(markers$p_val_adj < 0.05),]
We can draw violin plot for the top 5 markers for each clusters.
for (cluster in unique(markers$cluster)) {
print(cluster)
print(VlnPlot(object = seurat,features = head(markers[which(markers$cluster==cluster),"gene"],9),pt.size = 0.01,cols = colorClusters)) +ggtitle(cluster)
}
## [1] "0"
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
We can create a data frame with count of meta data of interest for each cluster with ddply function from plyr package.
rowCountMeta <- ddply(seurat@meta.data,~seurat_clusters + library_id + phases ,nrow)
head(rowCountMeta)
## seurat_clusters library_id phases V1
## 1 0 LTHSC G1_G0 103
## 2 0 LTHSC G2_M 28
## 3 0 LTHSC S 20
## 4 0 STHSC G1_G0 128
## 5 0 STHSC G2_M 11
## 6 0 STHSC S 7
Then this dataframe can be easily used with ggplot. We can visualize the percentage of each cell type in each cluster:
ggplot(data = rowCountMeta,aes(x = seurat_clusters, fill=library_id,y=V1)) +
geom_bar( stat="identity", position="fill") +
scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+
ylab(label = "clusters")+xlab(label = "") +
coord_flip() +
scale_fill_manual(values = colorCellType)
#theme(legend.title=element_blank()) +
And also the percentage of cell cycle phases:
ggplot(data = rowCountMeta,aes(x = seurat_clusters, fill=phases,y=V1)) +
geom_bar( stat="identity", position="fill") +
scale_y_continuous(name = "Cell type (%)", labels = c(0,25,50,75,100))+
ylab(label = "clusters")+xlab(label = "") +
coord_flip()
To characterize the different cluster we will use gprofiler to tet markers enrichment for terms in different databases/ontologies (GO, KEG pathways, REACTOME…)
#first we need to split the markers data frame in a list
getMarkersFromClust <- function(cluster,markersTable) {
markers <- markersTable[which(markersTable$cluster == cluster),]
return(markers$gene)
}
markersList <- lapply(unique(markers$cluster),getMarkersFromClust,markersTable = markers)
#rename list names because seurat start at 0
names(markersList) <- c(0:(length(markersList)-1))
gprofilerCluster <- gprofiler(markersList,organism = "mmusculus",custom_bg = rownames(seurat),src_filter = c("KEGG","REAC"))
## Warning: Please consider using the new package "gprofiler2". At the moment you are using a deprecated package relying on outdated data.
## More information at https://biit.cs.ut.ee/gprofiler/page/r-new. Feel free to contact us at biit.support@ut.ee for further help.
Now we can rename the clusters
Idents(seurat) <- seurat@meta.data$RNA_snn_res.0.5
new.ident.name = c("rep","diff","np1","np2","pEr","pMk","pNeu","pMo1","pMast","pB","pT","pMo2")
names(x = new.ident.name) <- levels(x = seurat)
seurat <- RenameIdents(object = seurat, new.ident.name)
## We choose an order that roughly follow proportion of cell type assigned with CaSTLe
clustersOrderFig1 <- c("np1","np2","diff","rep","pMk","pT","pB","pEr","pMo1","pMo2","pNeu","pMast")
Idents(seurat) <- factor(Idents(seurat), levels = clustersOrderFig1)
seurat@meta.data$numclust <-Idents(seurat)
DimPlot(seurat)
FeaturePlot function can be used to visualize some specific gene expression in the reduced space.
selectedMarkers <- c("Mllt3","H2afy","Cd34","Lig1","Itga2b",
"Il7r","Cd79a","Klf1","Fn1","Fcgr4","Mpo","Hdc")
getFeaturePlot <- function(seurat,feature) {
p <- FeaturePlot(seurat,features = feature) + NoLegend()
return(p)
}
featureplots <- lapply(selectedMarkers,getFeaturePlot,seurat = seurat)
plot <- cowplot::plot_grid(plotlist = featureplots,nrow=4)
plot
Altough a version 3 in monocle is available we will work with the version 2 (Qiu et al. 2017) [(because I am used to it ;) ). first we need to create a monocle object, we will use directly the scale data of seurat that have been cell cycle regressed. It contains only the higly variable genes detected by seurat, those we will use the same 2000 genes for seurat and monocle data processing.
library("monocle")
packageVersion("monocle")
## [1] '2.14.0'
#seuratForMonocle = subset(seurat, idents = "pB",invert = TRUE)
seuratScaleData <- GetAssayData(object = seurat, slot = "scale.data")[,rownames(seurat@meta.data)]
pd <- new("AnnotatedDataFrame", data = seurat@meta.data)
fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = rownames(seuratScaleData)))
rownames(fd) <- fd$gene_short_name
monocle <- newCellDataSet(seuratScaleData,
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.1,
expressionFamily = uninormal())
fData(monocle)$use_for_ordering <- TRUE
We use the DDRTree algorythm of monocle 2 to construct the trajectory.
print("Reducing dimension by DDRTree...")
## [1] "Reducing dimension by DDRTree..."
monocle <- reduceDimension(monocle,
max_components = 2,
reduction_method = 'DDRTree',
norm_method = "none",
pseudo_expr = 0,
verbose = F)
print("Ordering cells...")
## [1] "Ordering cells..."
monocle <- orderCells(monocle)
trajectoryState <- plot_cell_trajectory(monocle)
trajectoryCluster <- plot_cell_trajectory(monocle,color_by = "numclust")
trajectoryCellType <- plot_cell_trajectory(monocle,color_by = "library_id")
trajectoryState
trajectoryCellType
trajectoryCluster
for (c in unique(pData(monocle)[,"numclust"])) {
pData(monocle)$clustOfInterest <- FALSE
pData(monocle)[which(pData(monocle)[,"numclust"] == c),"clustOfInterest"] <- TRUE
print(plot_cell_trajectory(monocle,color_by="clustOfInterest") + ggtitle(c))
}
Qiu, Xiaojie, Qi Mao, Ying Tang, Li Wang, Raghav Chawla, Hannah A Pliner, and Cole Trapnell. 2017. “Reversed Graph Embedding Resolves Complex Single-Cell Trajectories.” Nature Methods 14 (10). Nature Publishing Group: 979.
Rodriguez-Fraticelli, Alejo E, Samuel L Wolock, Caleb S Weinreb, Riccardo Panero, Sachin H Patel, Maja Jankovic, Jianlong Sun, Raffaele A Calogero, Allon M Klein, and Fernando D Camargo. 2018. “Clonal Analysis of Lineage Fate in Native Haematopoiesis.” Nature 553 (7687). Nature Publishing Group: 212.
Stuart, Tim, Andrew Butler, Paul Hoffman, Christoph Hafemeister, Efthymia Papalexi, William M Mauck III, Yuhan Hao, Marlon Stoeckius, Peter Smibert, and Rahul Satija. 2019. “Comprehensive Integration of Single-Cell Data.” Cell. Elsevier.